// utf8.rs
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
use std::{hint, iter};
/// An iterator over UTF-8 encoded characters.
///
/// This differs from [`std::str::Chars`] in that it works on unsanitized
/// byte slices and transparently replaces invalid UTF-8 sequences with U+FFFD.
///
/// This follows ICU's bitmask approach for `U8_NEXT_OR_FFFD` relatively
/// closely. This is important for compatibility, because it implements the
/// WHATWG recommendation for UTF8 error recovery. It's also helpful, because
/// the excellent folks at ICU have probably spent a lot of time optimizing it.
///
/// The offset is never validated: it may point into the middle of a sequence
/// (the stray continuation bytes then decode as U+FFFD) or past the end of the
/// source (the iterator is then simply exhausted).
#[derive(Clone, Copy)]
pub struct Utf8Chars<'a> {
    /// The raw, possibly malformed, bytes being decoded.
    source: &'a [u8],
    /// Index into `source` of the next byte to decode.
    offset: usize,
}

impl<'a> Utf8Chars<'a> {
    /// Creates a new `Utf8Chars` iterator starting at the given `offset`.
    ///
    /// An `offset` at or past `source.len()` yields an exhausted iterator.
    pub fn new(source: &'a [u8], offset: usize) -> Self {
        Self { source, offset }
    }

    /// Returns the byte slice this iterator was created with.
    pub fn source(&self) -> &'a [u8] {
        self.source
    }

    /// Checks if the source is empty.
    ///
    /// This looks at the whole source, not at what is left to iterate;
    /// use [`Utf8Chars::has_next`] for the latter.
    pub fn is_empty(&self) -> bool {
        self.source.is_empty()
    }

    /// Returns the length of the source in bytes, independent of the offset.
    pub fn len(&self) -> usize {
        self.source.len()
    }

    /// Returns the current offset in the byte slice.
    ///
    /// This will be past the last returned character.
    pub fn offset(&self) -> usize {
        self.offset
    }

    /// Sets the offset to continue iterating from.
    ///
    /// The offset is stored as-is; values at or past the end of the source
    /// make the iterator return `None`.
    pub fn seek(&mut self, offset: usize) {
        self.offset = offset;
    }

    /// Returns true if `next` will return another character.
    pub fn has_next(&self) -> bool {
        self.offset < self.source.len()
    }

    /// Decodes a non-ASCII sequence whose lead byte `c` has already been
    /// consumed, i.e. `self.offset` points at the first trail byte.
    ///
    /// Returns the decoded character, or U+FFFD if the sequence is malformed
    /// or truncated. On error the offset is left right after the last byte
    /// that was accepted, so the offending byte is decoded again by the next
    /// call to `next`.
    // I found that on mixed 50/50 English/Non-English text,
    // performance actually suffers when this gets inlined.
    #[cold]
    fn next_slow(&mut self, c: u8) -> char {
        // A lead byte (or stray trail byte) at the very end of the input.
        if self.offset >= self.source.len() {
            return Self::fffd();
        }

        let mut cp = c as u32;

        if cp < 0xE0 {
            // UTF8-2 = %xC2-DF UTF8-tail

            // 0x80-BF are stray trail bytes, 0xC0-C1 would be overlong encodings.
            if cp < 0xC2 {
                return Self::fffd();
            }

            // The lead byte is 110xxxxx
            // -> Strip off the 110 prefix
            cp &= !0xE0;
        } else if cp < 0xF0 {
            // UTF8-3 =
            //   %xE0    %xA0-BF   UTF8-tail
            //   %xE1-EC UTF8-tail UTF8-tail
            //   %xED    %x80-9F   UTF8-tail
            //   %xEE-EF UTF8-tail UTF8-tail

            // This is a pretty neat approach seen in ICU4C, because it's a 1:1 translation of the RFC.
            // I don't understand why others don't do the same thing. It's rather performant.
            //
            // Each table entry is a bitmask indexed by the top 3 bits of the first trail byte.
            const BITS_80_9F: u8 = 1 << 0b100; // 0x80-9F, aka 0b100xxxxx
            const BITS_A0_BF: u8 = 1 << 0b101; // 0xA0-BF, aka 0b101xxxxx
            const BITS_BOTH: u8 = BITS_80_9F | BITS_A0_BF;
            const LEAD_TRAIL1_BITS: [u8; 16] = [
                //             v-- lead byte
                BITS_A0_BF, // 0xE0
                BITS_BOTH,  // 0xE1
                BITS_BOTH,  // 0xE2
                BITS_BOTH,  // 0xE3
                BITS_BOTH,  // 0xE4
                BITS_BOTH,  // 0xE5
                BITS_BOTH,  // 0xE6
                BITS_BOTH,  // 0xE7
                BITS_BOTH,  // 0xE8
                BITS_BOTH,  // 0xE9
                BITS_BOTH,  // 0xEA
                BITS_BOTH,  // 0xEB
                BITS_BOTH,  // 0xEC
                BITS_80_9F, // 0xED
                BITS_BOTH,  // 0xEE
                BITS_BOTH,  // 0xEF
            ];

            // The lead byte is 1110xxxx
            // -> Strip off the 1110 prefix
            cp &= !0xF0;

            let t = self.source[self.offset] as u32;
            if LEAD_TRAIL1_BITS[cp as usize] & (1 << (t >> 5)) == 0 {
                return Self::fffd();
            }
            cp = (cp << 6) | (t & 0x3F);

            self.offset += 1;
            if self.offset >= self.source.len() {
                return Self::fffd();
            }
        } else {
            // UTF8-4 =
            //   %xF0    %x90-BF   UTF8-tail UTF8-tail
            //   %xF1-F3 UTF8-tail UTF8-tail UTF8-tail
            //   %xF4    %x80-8F   UTF8-tail UTF8-tail

            // This is similar to the above, but with the indices flipped:
            // The trail byte is the index and the lead byte mask is the value.
            // This is because the split at 0x90 requires more bits than fit into an u8.
            //
            // The index is the high nibble of the first trail byte;
            // bit `n` of an entry is set if lead byte `0xF0 + n` may precede it.
            const TRAIL1_LEAD_BITS: [u8; 16] = [
                //  +------ 0xF4 lead
                //  |   ...
                //  |   +-- 0xF0 lead
                //  v   v
                0b_00000, //
                0b_00000, //
                0b_00000, //
                0b_00000, //
                0b_00000, //
                0b_00000, //
                0b_00000, // trail bytes:
                0b_00000, //
                0b_11110, // 0x80-8F -> 0x80-8F can be preceded by 0xF1-F4
                0b_01111, // 0x90-9F -v
                0b_01111, // 0xA0-AF -> 0x90-BF can be preceded by 0xF0-F3
                0b_01111, // 0xB0-BF -^
                0b_00000, //
                0b_00000, //
                0b_00000, //
                0b_00000, //
            ];

            // The lead byte *may* be 11110xxx, but could also be e.g. 11111xxx.
            // -> Only strip off the 1111 prefix
            cp &= !0xF0;

            // Now we can verify if it's actually <= 0xF4.
            // Curiously, this if condition does a lot of heavy lifting for
            // performance (+13%). I think it's just a coincidence though.
            if cp > 4 {
                return Self::fffd();
            }

            let t = self.source[self.offset] as u32;
            if TRAIL1_LEAD_BITS[(t >> 4) as usize] & (1 << cp) == 0 {
                return Self::fffd();
            }
            cp = (cp << 6) | (t & 0x3F);

            self.offset += 1;
            if self.offset >= self.source.len() {
                return Self::fffd();
            }

            // UTF8-tail = %x80-BF
            // (`wrapping_sub` maps anything below 0x80 to a huge value, so a
            // single comparison rejects both sides of the valid range.)
            let t = (self.source[self.offset] as u32).wrapping_sub(0x80);
            if t > 0x3F {
                return Self::fffd();
            }
            cp = (cp << 6) | t;

            self.offset += 1;
            if self.offset >= self.source.len() {
                return Self::fffd();
            }
        }

        // SAFETY: All branches above check for `if self.offset >= self.source.len()`
        // one way or another. This is here because the compiler doesn't get it otherwise.
        unsafe { hint::assert_unchecked(self.offset < self.source.len()) };

        // The final trail byte, shared by all sequence lengths.
        // UTF8-tail = %x80-BF
        let t = (self.source[self.offset] as u32).wrapping_sub(0x80);
        if t > 0x3F {
            return Self::fffd();
        }
        cp = (cp << 6) | t;

        self.offset += 1;

        // SAFETY: If `cp` wasn't a valid codepoint, we already returned U+FFFD above.
        unsafe { char::from_u32_unchecked(cp) }
    }

    /// Returns the replacement character U+FFFD.
    // This simultaneously serves as a `cold_path` marker.
    // It improves performance by ~5% and reduces code size.
    #[cold]
    #[inline(always)]
    fn fffd() -> char {
        '\u{FFFD}'
    }
}

impl Iterator for Utf8Chars<'_> {
    type Item = char;

    /// Decodes the character at the current offset and advances past it.
    ///
    /// Returns `None` once the offset is at or past the end of the source.
    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        if self.offset >= self.source.len() {
            return None;
        }

        let c = self.source[self.offset];
        self.offset += 1;

        // Fast-passing ASCII allows this function to be trivially inlined everywhere,
        // as the full decoder is a little too large for that.
        if (c & 0x80) == 0 {
            // UTF8-1 = %x00-7F
            Some(c as char)
        } else {
            // Weirdly enough, adding a hint here to assert that `next_slow`
            // only returns codepoints >= 0x80 makes `ucd` ~5% slower.
            Some(self.next_slow(c))
        }
    }

    /// Bounds the number of characters left, based on the remaining byte count.
    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        // `new` and `seek` accept offsets past the end of the source, and `next`
        // treats those as "exhausted", so the subtraction must not underflow.
        let remaining = self.source.len().saturating_sub(self.offset);
        // Lower bound: every `next` call consumes at most 4 bytes.
        // Upper bound: every `next` call consumes at least 1 byte.
        (remaining.div_ceil(4), Some(remaining))
    }
}

impl iter::FusedIterator for Utf8Chars<'_> {}
#[cfg(test)]
mod tests {
    use super::*;

    /// Decodes `source` with [`Utf8Chars`] and cross-checks every returned
    /// character and offset against the standard library's `utf8_chunks`,
    /// which reports one invalid run per U+FFFD replacement.
    fn assert_matches_std(source: &[u8]) {
        let mut chars = Utf8Chars::new(source, 0);
        let mut offset = 0;

        for chunk in source.utf8_chunks() {
            for ch in chunk.valid().chars() {
                offset += ch.len_utf8();
                assert_eq!(chars.next(), Some(ch));
                assert_eq!(chars.offset(), offset);
            }

            if !chunk.invalid().is_empty() {
                offset += chunk.invalid().len();
                assert_eq!(chars.next(), Some('\u{FFFD}'));
                assert_eq!(chars.offset(), offset);
            }
        }

        // Both decoders must have consumed the entire input, and the
        // iterator must report exhaustion (and keep doing so).
        assert_eq!(offset, source.len());
        assert!(!chars.has_next());
        assert_eq!(chars.next(), None);
        assert_eq!(chars.next(), None);
    }

    #[test]
    fn test_broken_utf8() {
        // Encoded surrogate (U+D800).
        assert_matches_std(&[b'a', 0xED, 0xA0, 0x80, b'b']);
        // Overlong encoding of U+0000.
        assert_matches_std(&[0xC0, 0x80]);
        // Sequences cut off by the end of the input.
        assert_matches_std(&[0xE2]);
        assert_matches_std(&[0xE2, 0x82]);
        assert_matches_std(&[0xF0, 0x9F, 0x98]);
        // Sequences interrupted by an ASCII byte.
        assert_matches_std(&[0xE2, b'A']);
        assert_matches_std(&[0xF0, 0x9F, b'A']);
        // Beyond U+10FFFF, and bytes that can never start a sequence.
        assert_matches_std(&[0xF4, 0x90, 0x80, 0x80]);
        assert_matches_std(&[0xFF, b'x']);
    }

    #[test]
    fn test_valid_utf8() {
        assert_matches_std(b"");
        assert_matches_std(b"plain ascii");
        assert_matches_std("h\u{E9}llo \u{20AC}uro \u{1F600}".as_bytes());
        // Smallest and largest code point of each encoded length.
        assert_matches_std("\u{80}\u{7FF}\u{800}\u{FFFF}\u{10000}\u{10FFFF}".as_bytes());
    }
}